This code chunk is where we load in all of the packages that we will use in this script using library(packagename)

# Show the code of every chunk in the knitted document
knitr::opts_chunk$set(echo = TRUE)
library(jsonlite) # allows us to read in json files
library(tidyverse) # allows us to do lots of data manipulation and basic data science
library(here) # allows us to cut out long file paths (ex. "users/connor/downloads/etc")
library(forcats) # allows us to reorder factor levels (fct_infreq, fct_rev) for plotting
library(tidytext) # allows us to tokenize data
library(dplyr) # allows us to manipulate dataframes
library(stringr) # allows us to count the number of words in a cell
library(quanteda) # allows us to tokenize data
library(quanteda.textplots) # allows us to make network plots
library(gridExtra) # allows us to combine multiple plots into 1
library(wordcloud) # allows us to generate word clouds

Here is where we read in our data

After you run the chunk below successfully, a dataframe called climate_fever should appear in your environment tab

# Resolve the file path relative to the project root, open a connection to it,
# and stream the line-delimited JSON records into a data frame
climate_fever <- here("data/climate-fever-dataset-r1.jsonl") %>%
  file() %>%
  stream_in()
## 
 Found 500 records...
 Found 1000 records...
 Found 1500 records...
 Found 1535 records...
 Imported 1535 records. Simplifying...

Open the dataframe and look at the claim_label column

# List each distinct value that appears in the claim_label column
unique(climate_fever$claim_label)
## [1] "SUPPORTS"        "REFUTES"         "NOT_ENOUGH_INFO" "DISPUTED"

Once we use the unique function unique(dataframe_name$dataframe_column) to find the unique values in the dataframe, we can see there are 4 unique values

How many times does each of these values appear in the column?

# Tally how many claims carry each label (table() reports counts, not proportions)
table(climate_fever$claim_label)
## 
##        DISPUTED NOT_ENOUGH_INFO         REFUTES        SUPPORTS 
##             154             474             253             654

Looks like the most common value in this column is SUPPORTS and the least common value is DISPUTED

Let’s make a bar graph of this information

# Bar chart: one bar per claim label, bar height = number of claims
climate_fever %>%
  ggplot(aes(x = claim_label)) +
  geom_bar()

Here is our information visualized, congrats on your first plot of SPICE!!

Let’s make this a little bit nicer

# Same bar chart, polished:
# - fct_infreq() orders the labels by frequency and fct_rev() reverses that
#   order, so after coord_flip() the most common label sits at the top
# - labs() names the axes *before* the flip, which is why "Number of Claims"
#   is given as y even though it is drawn horizontally
climate_fever %>%
  ggplot(aes(x = fct_rev(fct_infreq(claim_label)))) +
  geom_bar(fill = "steelblue", color = "black") +
  coord_flip() +
  labs(
    title = "Number of Claims by Type - Climate Fever",
    x = "Type of Claim",
    y = "Number of Claims"
  ) +
  theme_minimal()

Now, look at the column claim. Let’s count the number of words in each claim and store the result in a new column

# Add a word_count column holding the number of whitespace-separated words in
# each claim ("\\S+" matches one run of non-whitespace characters).
# The column is referred to as `claim`, not `climate_fever$claim`: inside
# mutate() a bare name is looked up in the data flowing through the pipe, so
# the code keeps working if the pipeline is later grouped, filtered, or pointed
# at a different data frame.
climate_fever <- climate_fever %>%
  mutate(word_count = str_count(claim, "\\S+"))

Now look at the climate_fever dataframe and there should be a new column called word_count

With this column we can now plot the frequency of the number of words in a claim with a histogram

To practice, add a title, an x-axis label, and a y-axis label to this plot

# Histogram of claim length in words, split into 67 bins
climate_fever %>%
  ggplot(aes(x = word_count)) +
  geom_histogram(
    bins = 67,
    fill = "steelblue",
    color = "black"
  ) +
  theme_minimal()

Does the number of words in a claim correlate with the claim label?

Let’s color the plot by claim_label to find out

# Same histogram, with each bar's fill colour mapped to claim_label so the
# length distribution can be compared across labels
climate_fever %>%
  ggplot(aes(x = word_count, fill = claim_label)) +
  geom_histogram(
    bins = 67,
    color = "black"
  ) +
  theme_minimal()

Now let’s look more into the specific language of the claims

Tokenize - break the claims down by word (~31,000 words)

# One row per word: split the text in `claim` into tokens and store each token
# in a new column named `word`
climate_fever_tokenized <- unnest_tokens(
  climate_fever,
  output = word,
  input = claim
)

Count the number of times each word appears

# Replace the token table with a frequency table: one row per distinct word,
# its count in column `n`, most frequent words first
climate_fever_tokenized <- count(climate_fever_tokenized, word, sort = TRUE)

We see lots of common function words (articles, prepositions, conjunctions) at the top of the list - Since these words don’t carry much meaning, let’s filter them out - There are many ways to do this - Here is the most basic:

Create a stop words object of words that don’t hold much meaning to our analysis - This is just a few

# Hand-picked words to exclude from the word counts
stop_word_list <- c(
  "the", "of", "a", "is", "in", "to", "and",
  "that", "from", "on", "with", "by", "for"
)

Filter the dataset for words that are not in the stop list

# Keep only the rows whose word is absent from stop_word_list
climate_fever_tokenized <- filter(
  climate_fever_tokenized,
  !(word %in% stop_word_list)
)

Create a Word Cloud

# Word cloud of the remaining words: at most 200 words, each appearing at
# least 5 times, with the most frequent words placed first
# (random.order = FALSE) and coloured from the 8-colour "Dark2" palette
with(
  climate_fever_tokenized,
  wordcloud(
    words = word,
    freq = n,
    min.freq = 5,
    max.words = 200,
    random.order = FALSE,
    rot.per = 0.35,
    colors = brewer.pal(8, "Dark2")
  )
)

Now, let’s look further into how the words in these claims are used together by creating a network of feature co-occurrences

Whole Dataset

# Build a quanteda corpus with one document per claim
climate_fever_corpus <- corpus(climate_fever$claim)

# Tokenise the claims, dropping punctuation, then lower-case the tokens and
# remove English stop words (padding = FALSE leaves no placeholder behind)
toks <- tokens(climate_fever_corpus, remove_punct = TRUE)
toks <- tokens_tolower(toks)
toks <- tokens_remove(toks, pattern = stopwords("english"), padding = FALSE)

# Feature co-occurrence matrix counted within a sliding window of tokens;
# tri = FALSE keeps the full matrix rather than only its upper triangle
fcmat <- fcm(toks, context = "window", tri = FALSE)

# Names of the 30 top features of the co-occurrence matrix
feat <- names(topfeatures(fcmat, 30))

# Restrict the matrix to those features and draw their co-occurrence network
textplot_network(
  fcm_select(fcmat, pattern = feat),
  min_freq = 0.5
)

The network plot above gives us more insight into how words are being used together

Would a network plot subset on the claims that refute climate change look different than this?

Refuting Claims

# Subset to the claims labelled REFUTES and turn their text into a corpus
climate_fever_refute <- filter(climate_fever, claim_label == "REFUTES")
climate_fever_refute_corpus <- corpus(climate_fever_refute$claim)

# Tokenise without punctuation, lower-case, and drop English stop words
toks_refute <- tokens_remove(
  tokens_tolower(tokens(climate_fever_refute_corpus, remove_punct = TRUE)),
  pattern = stopwords("english"),
  padding = FALSE
)

# Window-based feature co-occurrence matrix and its 30 top features
fcmat_refute <- fcm(toks_refute, context = "window", tri = FALSE)
feat_refute <- names(topfeatures(fcmat_refute, 30))

# Store the network plot so it can be reused later, then display it
network_refute <- textplot_network(
  fcm_select(fcmat_refute, pattern = feat_refute),
  min_freq = 0.5
)
network_refute

Let’s do another network plot only for the claims that support climate change

Supporting Claims

# Subset to the claims labelled SUPPORTS and turn their text into a corpus
climate_fever_support <- filter(climate_fever, claim_label == "SUPPORTS")
climate_fever_support_corpus <- corpus(climate_fever_support$claim)

# Tokenise without punctuation, lower-case, and drop English stop words
toks_support <- tokens_remove(
  tokens_tolower(tokens(climate_fever_support_corpus, remove_punct = TRUE)),
  pattern = stopwords("english"),
  padding = FALSE
)

# Window-based feature co-occurrence matrix and its 30 top features
fcmat_support <- fcm(toks_support, context = "window", tri = FALSE)
feat_support <- names(topfeatures(fcmat_support, 30))

# Store the network plot so it can be reused later, then display it
network_support <- textplot_network(
  fcm_select(fcmat_support, pattern = feat_support),
  min_freq = 0.5
)
network_support

To more easily compare the 2 network plots (support claims vs refute claims), let’s put them both on the same plot using grid.arrange

# Draw the two stored network plots side by side: supporting claims on the
# left, refuting claims on the right
grid.arrange(
  grobs = list(network_support, network_refute),
  ncol = 2
)

Few takeaways:

Now let’s look deeper into the linguistics of the claims with Ngrams

# Keep only the claim text; the other columns are not needed for n-grams
climate_fever_claims <- climate_fever %>%
  select(claim)

# Build the bigram table:
# 1. split every claim into overlapping two-word sequences
# 2. drop NA bigrams - a claim with fewer than two words yields NA, which
#    would otherwise pass the stop-word filter and be glued back together
#    by unite() as the literal string "NA NA"
# 3. split each bigram into its two words
# 4. discard bigrams in which either word is in tidytext's stop_words table
# 5. paste the two words back into a single bigram column
ngrams <- climate_fever_claims %>%
  unnest_tokens(bigram, claim, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram)) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word
  ) %>%
  unite(bigram, word1, word2, sep = " ")

# Frequency of each remaining bigram, most common first
ngrams_counts <- ngrams %>%
  count(bigram, sort = TRUE)

head(ngrams_counts)
##           bigram   n
## 1 global warming 193
## 2 climate change 128
## 3 carbon dioxide  81
## 4      sea level  73
## 5     level rise  44
## 6        sea ice  44

4Grams

# Build the 4-gram table from the claim text:
# 1. split every claim into overlapping four-word sequences
# 2. drop NA 4-grams - a claim with fewer than four words yields NA, which
#    would otherwise pass the stop-word filter and be glued back together
#    by unite() as the literal string "NA NA NA NA"
# 3. split each 4-gram into its four words
# 4. discard 4-grams in which any word is in tidytext's stop_words table
# 5. paste the four words back into a single fourgram column
ngrams4 <- climate_fever_claims %>%
  unnest_tokens(fourgram, claim, token = "ngrams", n = 4) %>%
  filter(!is.na(fourgram)) %>%
  separate(fourgram, c("word1", "word2", "word3", "word4"), sep = " ") %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    !word3 %in% stop_words$word,
    !word4 %in% stop_words$word
  ) %>%
  unite(fourgram, word1, word2, word3, word4, sep = " ")

# Frequency of each remaining 4-gram, most common first
ngrams4_count <- ngrams4 %>%
  count(fourgram, sort = TRUE)

head(ngrams4_count)
##                             fourgram n
## 1        human caused global warming 6
## 2 atmospheric carbon dioxide content 3
## 3              global sea level rise 3
## 4           medieval warm period mwp 3
## 5         sea level rise predictions 3
## 6        58 peer reviewed scientific 2